# Import standard libraries
import math
import os
import warnings
from pathlib import Path
# Import third-party libraries
import numpy as np
import pandas as pd
import plotly.express as px
from tqdm.auto import tqdm
from IPython.display import display, HTML
# Function to install a library if it's not already installed
def install_library(library_name):
    """Install *library_name* with pip only if it cannot already be imported.

    Parameters
    ----------
    library_name : str
        Importable module name (assumed to match the PyPI package name).
    """
    try:
        __import__(library_name)
    except ImportError:
        # BUGFIX: the original used the IPython-only `!pip install ...` shell
        # escape, which is a SyntaxError outside a notebook. Use the current
        # interpreter's pip (same pattern as the numba install below) so the
        # package lands in the active environment.
        import subprocess
        import sys
        subprocess.run([sys.executable, "-m", "pip", "install", library_name],
                       check=False)
# Install 'humanize' if it's not already present
install_library('humanize')
# Install 'statsmodels' if it's not already present
install_library('statsmodels')
# Import from local modules (requires the project's 'src' package on sys.path)
from src.utils.general import LogTime
# Seed NumPy's global RNG so any random operations below are reproducible
np.random.seed(42)
# Register tqdm with pandas so .progress_apply shows a progress bar
tqdm.pandas()
# To do feature engineering, read the already split files (train, test, val)
# so they can be recombined into a single frame.
try:
    # Attempt to read the parquet files into pandas dataframes
    train_df = pd.read_parquet("train.parquet")
    val_df = pd.read_parquet("val.parquet")
    test_df = pd.read_parquet("test.parquet")
except FileNotFoundError:
    # Show a visible HTML warning, then re-raise. BUGFIX: the original
    # swallowed the error and kept going, which crashed a few lines later
    # with a confusing NameError on train_df.
    display(HTML("""
<div style='background-color: #ffcccb; padding: 10px; border-left: 6px solid red;'>
<strong>Warning!</strong> File(s) not found. Please download them from Week 5 module into the same folder as this notebook.
</div>
"""))
    raise
# Tag each split so the rows can be separated again after feature engineering
train_df["type"] = "train"
val_df["type"] = "val"
test_df["type"] = "test"
# Combine the train, validation, and test dataframes into one frame
combined_df = pd.concat([train_df, val_df, test_df])
# Sort by household id ('LCLid') and 'timestamp' so the lag/rolling features
# below are computed in correct temporal order within each series
combined_df.sort_values(["LCLid", "timestamp"], inplace=True)
# Delete the individual dataframes to free up memory
del train_df, val_df, test_df
# Check the first five rows of the combined DataFrame to verify it looks correct
combined_df.head(5)
| timestamp | LCLid | energy_consumption | frequency | series_length | stdorToU | Acorn | Acorn_grouped | file | holidays | ... | temperature | dewPoint | pressure | apparentTemperature | windSpeed | precipType | icon | humidity | summary | type | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2012-01-01 00:00:00 | MAC000061 | 0.114 | 30min | 37872 | Std | ACORN-Q | Adversity | block_96 | NO_HOLIDAY | ... | 12.12 | 10.97 | 1008.099976 | 12.12 | 5.90 | rain | partly-cloudy-night | 0.93 | Mostly Cloudy | train |
| 1 | 2012-01-01 00:30:00 | MAC000061 | 0.113 | 30min | 37872 | Std | ACORN-Q | Adversity | block_96 | NO_HOLIDAY | ... | 12.12 | 10.97 | 1008.099976 | 12.12 | 5.90 | rain | partly-cloudy-night | 0.93 | Mostly Cloudy | train |
| 2 | 2012-01-01 01:00:00 | MAC000061 | 0.113 | 30min | 37872 | Std | ACORN-Q | Adversity | block_96 | NO_HOLIDAY | ... | 12.59 | 11.02 | 1007.880005 | 12.59 | 6.06 | rain | cloudy | 0.90 | Overcast | train |
| 3 | 2012-01-01 01:30:00 | MAC000061 | 0.098 | 30min | 37872 | Std | ACORN-Q | Adversity | block_96 | NO_HOLIDAY | ... | 12.59 | 11.02 | 1007.880005 | 12.59 | 6.06 | rain | cloudy | 0.90 | Overcast | train |
| 4 | 2012-01-01 02:00:00 | MAC000061 | 0.060 | 30min | 37872 | Std | ACORN-Q | Adversity | block_96 | NO_HOLIDAY | ... | 12.45 | 11.04 | 1007.950012 | 12.45 | 5.31 | rain | partly-cloudy-night | 0.91 | Mostly Cloudy | train |
5 rows × 22 columns
# Import libraries
import numpy as np
# Try to import 'jit' from 'numba'; install 'numba' if it is not present.
# NOTE: the '!' shell escape is IPython-only syntax — this cell works only
# inside a notebook. Using sys.executable ensures pip installs into the
# same environment as the running kernel.
try:
    from numba import jit
except ImportError:
    import sys
    !{sys.executable} -m pip install numba
    from numba import jit
# Import the 'add_lags' function from the local module; ensure 'src' is
# accessible on sys.path. The notebook cannot proceed without it, so print
# a helpful message and re-raise instead of continuing into a NameError.
try:
    from src.feature_engineering.autoregressive_features import add_lags
except ImportError as e:
    print(f"Error importing add_lags: {e}")
    raise
# Lag sets (in half-hour steps): the 5 most recent observations (1-5),
# around one day back (46-50), and around one week back (334-338,
# i.e. 48*7 - 2 through 48*7 + 2).
lags = (
    list(range(1, 6))
    + list(range(46, 51))
    + list(range(48 * 7 - 2, 48 * 7 + 3))
)
# Output the lags
print(lags)
[1, 2, 3, 4, 5, 46, 47, 48, 49, 50, 334, 335, 336, 337, 338]
# Create the lag features for every household (ts_id) and time the operation
with LogTime():
    combined_df, added_features = add_lags(
        combined_df, lags=lags, column="energy_consumption", ts_id="LCLid", use_32_bit=True
    )
# BUGFIX: corrected the typo "Featuresk Created" to match the other cells
print(f"Features Created: {','.join(added_features)}")
Time Elapsed: 0 microseconds Features Created: energy_consumption_lag_1,energy_consumption_lag_2,energy_consumption_lag_3,energy_consumption_lag_4,energy_consumption_lag_5,energy_consumption_lag_46,energy_consumption_lag_47,energy_consumption_lag_48,energy_consumption_lag_49,energy_consumption_lag_50,energy_consumption_lag_334,energy_consumption_lag_335,energy_consumption_lag_336,energy_consumption_lag_337,energy_consumption_lag_338
from src.feature_engineering.autoregressive_features import add_rolling_features

# Rolling mean/std over windows of 1.5h, 3h, 6h and one day (in 30-min steps),
# computed independently per household.
rolling_params = dict(
    rolls=[3, 6, 12, 48],
    column="energy_consumption",
    agg_funcs=["mean", "std"],
    ts_id="LCLid",
    use_32_bit=True,
)
with LogTime():
    combined_df, added_features = add_rolling_features(combined_df, **rolling_params)
print(f"Features Created: {', '.join(added_features)}")
Time Elapsed: 5 seconds Features Created: energy_consumption_rolling_3_mean, energy_consumption_rolling_3_std, energy_consumption_rolling_6_mean, energy_consumption_rolling_6_std, energy_consumption_rolling_12_mean, energy_consumption_rolling_12_std, energy_consumption_rolling_48_mean, energy_consumption_rolling_48_std
from src.feature_engineering.autoregressive_features import (
    add_seasonal_rolling_features,
)

# Seasonal rolling stats: a window of 3 same-time-of-day observations,
# taken at a seasonal period of 48 half-hours (one day apart).
# BUGFIX: seasonal_periods was [48, 48]; the duplicated period created the
# same two features twice (visible as repeated names in the printed output).
with LogTime():
    combined_df, added_features = add_seasonal_rolling_features(
        combined_df,
        rolls=[3],
        seasonal_periods=[48],
        column="energy_consumption",
        agg_funcs=["mean", "std"],
        ts_id="LCLid",
        use_32_bit=True,
    )
print(f"Features Created: {', '.join(added_features)}")
Time Elapsed: 3 seconds Features Created: energy_consumption_48_seasonal_rolling_3_mean, energy_consumption_48_seasonal_rolling_3_std, energy_consumption_48_seasonal_rolling_3_mean, energy_consumption_48_seasonal_rolling_3_std
import numpy as np
import pandas as pd
import plotly.express as px
import math
# Visualize how exponential-smoothing weights decay for observations further
# behind the current timestep t, for several smoothing factors alpha.
t = np.arange(25).tolist()
plot_df = pd.DataFrame({"Timesteps behind t": t})
for alpha in [0.3, 0.5, 0.8]:
    # Weight of the observation i steps behind t: alpha * (1 - alpha)^i
    weights = [alpha * math.pow((1 - alpha), i) for i in t]
    # Equivalent pandas 'span' parameter for this alpha: span = (2 - alpha) / alpha
    # (FIX: removed an unused 'half_life' computation that was never plotted)
    span = (2 - alpha) / alpha
    plot_df[f'Alpha={alpha} | Span={span:.2f}'] = weights
# Long format: one row per (timestep, alpha/span combination) pair, then
# one facet column per combination.
melted_weights = pd.melt(plot_df, id_vars="Timesteps behind t", var_name="Parameters")
fig = px.line(melted_weights, x="Timesteps behind t", y="value", facet_col="Parameters")
# Fix the figure size and enlarge the axis fonts for readability.
fig.update_layout(
    autosize=False,
    width=1200,
    height=500,
    yaxis=dict(title_text="Weights", titlefont=dict(size=15), tickfont=dict(size=15)),
    xaxis=dict(titlefont=dict(size=15), tickfont=dict(size=15)),
)
# Enlarge the facet titles as well
fig.update_annotations(font=dict(size=16))
fig.show()
from src.feature_engineering.autoregressive_features import add_ewma

# EWMA features at three time scales (in half-hour observations):
# ~two months (48*60), one week (48*7) and one day (48).
ewma_spans = [48 * 60, 48 * 7, 48]
with LogTime():
    combined_df, added_features = add_ewma(
        combined_df,
        spans=ewma_spans,
        column="energy_consumption",
        ts_id="LCLid",
        use_32_bit=True,
    )
print(f"Features Created: {', '.join(added_features)}")
Time Elapsed: 1 second Features Created: energy_consumption_ewma_span_2880, energy_consumption_ewma_span_336, energy_consumption_ewma_span_48
from src.feature_engineering.temporal_features import add_temporal_features

# Calendar features derived from the timestamp column (month, quarter,
# day-of-week, hour, minute, elapsed time, ...).
temporal_kwargs = dict(
    field_name="timestamp",
    frequency="30min",
    add_elapsed=True,
    drop=False,  # keep the original timestamp column
    use_32_bit=True,
)
with LogTime():
    combined_df, added_features = add_temporal_features(combined_df, **temporal_kwargs)
print(f"Features Created: {', '.join(added_features)}")
Time Elapsed: 1 second Features Created: timestamp_Month, timestamp_Quarter, timestamp_Is_quarter_end, timestamp_Is_quarter_start, timestamp_Is_year_end, timestamp_Is_year_start, timestamp_Is_month_start, timestamp_WeekDay, timestamp_Dayofweek, timestamp_Dayofyear, timestamp_Hour, timestamp_Minute, timestamp_Elapsed
from src.feature_engineering.temporal_features import (
    add_fourier_features,
    bulk_add_fourier_features,
)

# Encode the cyclic calendar columns as 5 sin/cos Fourier terms each,
# using each column's natural period as the maximum value.
cyclic_columns = ["timestamp_Month", "timestamp_Hour", "timestamp_Minute"]
with LogTime():
    combined_df, added_features = bulk_add_fourier_features(
        combined_df,
        cyclic_columns,
        max_values=[12, 24, 60],
        n_fourier_terms=5,
        use_32_bit=True,
    )
print(f"Features Created: {', '.join(added_features)}")
Time Elapsed: 3 seconds Features Created: timestamp_Month_sin_1, timestamp_Month_sin_2, timestamp_Month_sin_3, timestamp_Month_sin_4, timestamp_Month_sin_5, timestamp_Month_cos_1, timestamp_Month_cos_2, timestamp_Month_cos_3, timestamp_Month_cos_4, timestamp_Month_cos_5, timestamp_Hour_sin_1, timestamp_Hour_sin_2, timestamp_Hour_sin_3, timestamp_Hour_sin_4, timestamp_Hour_sin_5, timestamp_Hour_cos_1, timestamp_Hour_cos_2, timestamp_Hour_cos_3, timestamp_Hour_cos_4, timestamp_Hour_cos_5, timestamp_Minute_sin_1, timestamp_Minute_sin_2, timestamp_Minute_sin_3, timestamp_Minute_sin_4, timestamp_Minute_sin_5, timestamp_Minute_cos_1, timestamp_Minute_cos_2, timestamp_Minute_cos_3, timestamp_Minute_cos_4, timestamp_Minute_cos_5
import pandas as pd
import plotly.express as px
import numpy as np
# Compare the raw month number (a step function) with its first Fourier
# sine term (a continuous wave) over two repeated years.
plot_df = (
    combined_df[["timestamp_Month", "timestamp_Month_sin_1"]]
    .drop_duplicates()
    .sort_values("timestamp_Month")
)
plot_df.columns = ["calendar", "fourier"]
# Repeat the months twice and build a 1-based running index for the x-axis
plot_df1 = pd.concat([plot_df, plot_df]).reset_index(drop=True)
plot_df1.reset_index(inplace=True)
plot_df1["index"] += 1
# Long format: one row per (index, representation) pair
plot_df_melted = pd.melt(
    plot_df1, id_vars="index", var_name="month", value_name="Representation"
)
# One facet row per representation ("calendar" step vs "fourier" wave)
fig = px.line(plot_df_melted, x="index", y="Representation", facet_row="month")
# Style the comparison figure: fixed size, centered title, larger fonts.
fig.update_layout(
    autosize=False,
    width=900,
    height=800,
    title_text="Step Function vs Continuous Function",
    # Center the title horizontally
    title={
        "x": 0.5,
        "xanchor": "center",
        "yanchor": "top"
    },
    titlefont={"size": 20},
    legend_title=None,
    xaxis=dict(
        title_text="Time",
        titlefont={"size": 20},
    )
)
# Let each facet row use its own y-axis range (step vs sine differ in scale)
fig.update_yaxes(matches=None)
# NOTE(review): ticktext has 36 labels (1..12 repeated 3 times) while
# tickvals has len(plot_df_melted) entries (2 copies x 2 melted variables
# per month) — the lengths appear to disagree, so the tick labels are
# likely misaligned; confirm against the rendered figure.
fig.update_xaxes(
    ticktext=np.arange(1, 13).tolist() * 3,
    tickvals=np.arange(len(plot_df_melted)) + 1,
)
fig.show()
# Check the columns to confirm every engineered feature was added
combined_df.columns
Index(['timestamp', 'LCLid', 'energy_consumption', 'frequency',
'series_length', 'stdorToU', 'Acorn', 'Acorn_grouped', 'file',
'holidays', 'visibility', 'windBearing', 'temperature', 'dewPoint',
'pressure', 'apparentTemperature', 'windSpeed', 'precipType', 'icon',
'humidity', 'summary', 'type', 'energy_consumption_lag_1',
'energy_consumption_lag_2', 'energy_consumption_lag_3',
'energy_consumption_lag_4', 'energy_consumption_lag_5',
'energy_consumption_lag_46', 'energy_consumption_lag_47',
'energy_consumption_lag_48', 'energy_consumption_lag_49',
'energy_consumption_lag_50', 'energy_consumption_lag_334',
'energy_consumption_lag_335', 'energy_consumption_lag_336',
'energy_consumption_lag_337', 'energy_consumption_lag_338',
'energy_consumption_rolling_3_mean', 'energy_consumption_rolling_3_std',
'energy_consumption_rolling_6_mean', 'energy_consumption_rolling_6_std',
'energy_consumption_rolling_12_mean',
'energy_consumption_rolling_12_std',
'energy_consumption_rolling_48_mean',
'energy_consumption_rolling_48_std',
'energy_consumption_48_seasonal_rolling_3_mean',
'energy_consumption_48_seasonal_rolling_3_std',
'energy_consumption_ewma_span_2880', 'energy_consumption_ewma_span_336',
'energy_consumption_ewma_span_48', 'timestamp_Month',
'timestamp_Quarter', 'timestamp_Is_quarter_end',
'timestamp_Is_quarter_start', 'timestamp_Is_year_end',
'timestamp_Is_year_start', 'timestamp_Is_month_start',
'timestamp_WeekDay', 'timestamp_Dayofweek', 'timestamp_Dayofyear',
'timestamp_Hour', 'timestamp_Minute', 'timestamp_Elapsed',
'timestamp_Month_sin_1', 'timestamp_Month_sin_2',
'timestamp_Month_sin_3', 'timestamp_Month_sin_4',
'timestamp_Month_sin_5', 'timestamp_Month_cos_1',
'timestamp_Month_cos_2', 'timestamp_Month_cos_3',
'timestamp_Month_cos_4', 'timestamp_Month_cos_5',
'timestamp_Hour_sin_1', 'timestamp_Hour_sin_2', 'timestamp_Hour_sin_3',
'timestamp_Hour_sin_4', 'timestamp_Hour_sin_5', 'timestamp_Hour_cos_1',
'timestamp_Hour_cos_2', 'timestamp_Hour_cos_3', 'timestamp_Hour_cos_4',
'timestamp_Hour_cos_5', 'timestamp_Minute_sin_1',
'timestamp_Minute_sin_2', 'timestamp_Minute_sin_3',
'timestamp_Minute_sin_4', 'timestamp_Minute_sin_5',
'timestamp_Minute_cos_1', 'timestamp_Minute_cos_2',
'timestamp_Minute_cos_3', 'timestamp_Minute_cos_4',
'timestamp_Minute_cos_5'],
dtype='object')